Set up options

Install new packages

Load packages

Specify file directories

directory <- "/cloud/project" # Class option when coding on RStudio Cloud
# Will need to update if working on personal computer

Load data, remake useful variables

# Check the file path
file.path(directory, "nhanes3.rda")
## [1] "/cloud/project/nhanes3.rda"
# Load the saved R data
load(file.path(directory, "nhanes3.rda"))


# Remake a few variables from last class if they are no longer in your environment
sex1 <- factor(nhanes$sex, levels = c(1, 2), labels = c("male", "female"))
AGE5b <- cut(nhanes$age, quantile(nhanes$age, c(0, .2, .4, .6, .8, 1)), include.lowest = T) # quintiles
AGE5c <- cut(nhanes$age, breaks = c(19, 40, 50, 60, 70, 90))
age5c <- unclass(AGE5c)
nhanes <- cbind(nhanes, sex1, AGE5b, AGE5c, age5c)


# Clean up the dataset. Make sure NaN are NA for key covariates
table(nhanes$educ, useNA = "always")
## 
##    1    2    3  NaN <NA> 
## 2032 2349  660   33    0
nhanes$educ[is.nan(nhanes$educ)] <- NA
table(nhanes$educ, useNA = "always")
## 
##    1    2    3 <NA> 
## 2032 2349  660   33
table(nhanes$alc, useNA = "always")
## 
##    0    1  NaN <NA> 
## 2753 2189  132    0
nhanes$alc[is.nan(nhanes$alc)] <- NA
table(nhanes$alc, useNA = "always")
## 
##    0    1 <NA> 
## 2753 2189  132

6.1. Linear Models: Association between systolic blood pressure (SBP) and blood lead (bpb)

## Does the distribution of log(sbp) look closer to the normal distribution?
par(mfrow = c(2, 1))
hist(nhanes$sbp, nclass = 20, col = "darkorchid")
hist(log(nhanes$sbp), nclass = 20, col = "seagreen3")

## Let's start with non-log transformed SBP first. Look at bivariate association between sbp and continuous covariates
par(mfrow = c(1, 1))
plot(nhanes$age, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "Age (years)")
lines(smooth.spline(nhanes$age, nhanes$sbp, df = 10), col = "dodgerblue", lwd = 3)

plot(nhanes$bmi, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "BMI (kg/m2)")
lines(smooth.spline(na.omit(nhanes$bmi), nhanes$sbp[na.omit(nhanes$bmi)], df = 3, ), col = "grey60", lwd = 3)

plot(nhanes$bpb, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "Blood lead level (ug/dL)")
lines(smooth.spline(nhanes$bpb, nhanes$sbp, df = 10), col = "grey60", lwd = 3)

Simple linear regression

## Start creating a simple regression model for systolic blood pressure,
## including only blood lead (bpb) (crude model).
sbp.model <- lm(sbp ~ bpb, na.action = na.omit, data = nhanes)
sbp.model
## 
## Call:
## lm(formula = sbp ~ bpb, data = nhanes, na.action = na.omit)
## 
## Coefficients:
## (Intercept)          bpb  
##     121.666        1.162
summary(sbp.model) # Is blood lead associated with SBP? Is this the direction you would expect?
## 
## Call:
## lm(formula = sbp ~ bpb, data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.586 -13.964  -3.757  10.612 114.521 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 121.66581    0.43552  279.36   <2e-16 ***
## bpb           1.16166    0.08528   13.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.6 on 5072 degrees of freedom
## Multiple R-squared:  0.03529,    Adjusted R-squared:  0.0351 
## F-statistic: 185.6 on 1 and 5072 DF,  p-value: < 2.2e-16
summary.aov(sbp.model)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## bpb            1   71318   71318   185.6 <2e-16 ***
## Residuals   5072 1949387     384                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(sbp.model)
## Analysis of Variance Table
## 
## Response: sbp
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## bpb          1   71318   71318  185.56 < 2.2e-16 ***
## Residuals 5072 1949387     384                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Add age in the model
sbp.model1 <- lm(sbp ~ bpb + age, na.action = na.omit, data = nhanes)
summary(sbp.model1) # Does anything change with age in the model?
## 
## Call:
## lm(formula = sbp ~ bpb + age, data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.405 -10.431  -1.427   8.470 101.240 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 95.00660    0.63062 150.655  < 2e-16 ***
## bpb          0.46641    0.07063   6.604 4.41e-11 ***
## age          0.60338    0.01181  51.077  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.93 on 5071 degrees of freedom
## Multiple R-squared:  0.363,  Adjusted R-squared:  0.3628 
## F-statistic:  1445 on 2 and 5071 DF,  p-value: < 2.2e-16

Add categorical variables linear regression

## Add race, which should be a categorical variable. We can use factor() or as.factor() to create indicator variables for each value of race
table(nhanes$race)
## 
##    1    2 
## 3634 1440
sbp.model2 <- lm(sbp ~ bpb + age + factor(race), na.action = na.omit, data = nhanes)
summary(sbp.model2)
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race), data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.773 -10.420  -1.257   8.525 101.680 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   93.88241    0.66373 141.446  < 2e-16 ***
## bpb            0.42848    0.07080   6.052 1.53e-09 ***
## age            0.61400    0.01195  51.378  < 2e-16 ***
## factor(race)2  2.66690    0.50308   5.301 1.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared:  0.3665, Adjusted R-squared:  0.3661 
## F-statistic: 977.8 on 3 and 5070 DF,  p-value: < 2.2e-16
#### Change the reference level for a factor variable
# Method 1: Make a new (or alter the old) variable
table(nhanes$race)
## 
##    1    2 
## 3634 1440
race2 <- relevel(factor(nhanes$race), ref = 2)
table(race2)
## race2
##    2    1 
## 1440 3634
sbp.model2 <- lm(sbp ~ bpb + age + factor(race2), na.action = na.omit, data = nhanes)
summary(sbp.model2)
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race2), data = nhanes, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.773 -10.420  -1.257   8.525 101.680 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    96.54931    0.69301 139.319  < 2e-16 ***
## bpb             0.42848    0.07080   6.052 1.53e-09 ***
## age             0.61400    0.01195  51.378  < 2e-16 ***
## factor(race2)1 -2.66690    0.50308  -5.301 1.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared:  0.3665, Adjusted R-squared:  0.3661 
## F-statistic: 977.8 on 3 and 5070 DF,  p-value: < 2.2e-16
# Method 2: Change the contrasts in the lm() statement
sbp.model2 <- lm(sbp ~ bpb + age + C(factor(race), contr.treatment(2, base = 2)), na.action = na.omit, data = nhanes)
summary(sbp.model2)
## 
## Call:
## lm(formula = sbp ~ bpb + age + C(factor(race), contr.treatment(2, 
##     base = 2)), data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.773 -10.420  -1.257   8.525 101.680 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                    96.54931    0.69301 139.319
## bpb                                             0.42848    0.07080   6.052
## age                                             0.61400    0.01195  51.378
## C(factor(race), contr.treatment(2, base = 2))1 -2.66690    0.50308  -5.301
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## bpb                                            1.53e-09 ***
## age                                             < 2e-16 ***
## C(factor(race), contr.treatment(2, base = 2))1 1.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared:  0.3665, Adjusted R-squared:  0.3661 
## F-statistic: 977.8 on 3 and 5070 DF,  p-value: < 2.2e-16
# R provides Type I sequential SS, not the default Type III marginal SS reported by SAS and SPSS.
# We can use the drop1() function to produce the familiar Type III results.
# It will compare each term with the full model.
anova(sbp.model2)
## Analysis of Variance Table
## 
## Response: sbp
##                                                 Df  Sum Sq Mean Sq  F value
## bpb                                              1   71318   71318  282.470
## age                                              1  662217  662217 2622.848
## C(factor(race), contr.treatment(2, base = 2))    1    7095    7095   28.102
## Residuals                                     5070 1280075     252         
##                                                  Pr(>F)    
## bpb                                           < 2.2e-16 ***
## age                                           < 2.2e-16 ***
## C(factor(race), contr.treatment(2, base = 2)) 1.199e-07 ***
## Residuals                                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1(sbp.model2, test = "F")
## Single term deletions
## 
## Model:
## sbp ~ bpb + age + C(factor(race), contr.treatment(2, base = 2))
##                                               Df Sum of Sq     RSS   AIC
## <none>                                                     1280075 28070
## bpb                                            1      9247 1289322 28104
## age                                            1    666469 1946544 30195
## C(factor(race), contr.treatment(2, base = 2))  1      7095 1287170 28096
##                                                F value    Pr(>F)    
## <none>                                                              
## bpb                                             36.625 1.535e-09 ***
## age                                           2639.688 < 2.2e-16 ***
## C(factor(race), contr.treatment(2, base = 2))   28.102 1.199e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Compare regression models

## Add other covariates to the model: which variables are biologically important?
## Add sex, BMI, educ and smk.
sbp.model3 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk), na.action = na.omit, data = nhanes)
summary(sbp.model3) # What covariates are associated with SBP? Do they make sense biologically?
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi + 
##     factor(educ) + factor(smk), data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.202  -9.583  -1.317   7.926 101.116 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   83.160324   1.346404  61.765  < 2e-16 ***
## bpb            0.277763   0.075759   3.666 0.000249 ***
## age            0.615200   0.012258  50.189  < 2e-16 ***
## factor(race)2  2.024195   0.498277   4.062 4.93e-05 ***
## factor(sex)2  -3.912620   0.476734  -8.207 2.85e-16 ***
## bmi            0.530437   0.038131  13.911  < 2e-16 ***
## factor(educ)2 -0.003008   0.484418  -0.006 0.995046    
## factor(educ)3 -2.675500   0.711182  -3.762 0.000170 ***
## factor(smk)2  -1.991441   0.562961  -3.537 0.000408 ***
## factor(smk)3  -0.270575   0.559123  -0.484 0.628459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.45 on 5021 degrees of freedom
##   (43 observations deleted due to missingness)
## Multiple R-squared:  0.3977, Adjusted R-squared:  0.3966 
## F-statistic: 368.4 on 9 and 5021 DF,  p-value: < 2.2e-16
# See output from models 1,2,and 3 side by side
stargazer(sbp.model1, sbp.model2, sbp.model3, type = "text", dep.var.labels = "Systolic Blood Pressure (mmHg)")
## 
## ==============================================================================================================================
##                                                                              Dependent variable:                              
##                                                -------------------------------------------------------------------------------
##                                                                        Systolic Blood Pressure (mmHg)                         
##                                                            (1)                        (2)                       (3)           
## ------------------------------------------------------------------------------------------------------------------------------
## bpb                                                     0.466***                   0.428***                  0.278***         
##                                                          (0.071)                    (0.071)                   (0.076)         
##                                                                                                                               
## age                                                     0.603***                   0.614***                  0.615***         
##                                                          (0.012)                    (0.012)                   (0.012)         
##                                                                                                                               
## C(factor(race), contr.treatment(2, base = 2))1                                     -2.667***                                  
##                                                                                     (0.503)                                   
##                                                                                                                               
## factor(race)2                                                                                                2.024***         
##                                                                                                               (0.498)         
##                                                                                                                               
## factor(sex)2                                                                                                 -3.913***        
##                                                                                                               (0.477)         
##                                                                                                                               
## bmi                                                                                                          0.530***         
##                                                                                                               (0.038)         
##                                                                                                                               
## factor(educ)2                                                                                                 -0.003          
##                                                                                                               (0.484)         
##                                                                                                                               
## factor(educ)3                                                                                                -2.675***        
##                                                                                                               (0.711)         
##                                                                                                                               
## factor(smk)2                                                                                                 -1.991***        
##                                                                                                               (0.563)         
##                                                                                                                               
## factor(smk)3                                                                                                  -0.271          
##                                                                                                               (0.559)         
##                                                                                                                               
## Constant                                                95.007***                  96.549***                 83.160***        
##                                                          (0.631)                    (0.693)                   (1.346)         
##                                                                                                                               
## ------------------------------------------------------------------------------------------------------------------------------
## Observations                                              5,074                      5,074                     5,031          
## R2                                                        0.363                      0.367                     0.398          
## Adjusted R2                                               0.363                      0.366                     0.397          
## Residual Std. Error                                15.932 (df = 5071)         15.890 (df = 5070)        15.445 (df = 5021)    
## F Statistic                                    1,444.936*** (df = 2; 5071) 977.807*** (df = 3; 5070) 368.374*** (df = 9; 5021)
## ==============================================================================================================================
## Note:                                                                                              *p<0.1; **p<0.05; ***p<0.01
### Check if alcohol consumption is a confounder
sbp.model4 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc), na.action = na.omit, data = nhanes)
summary(sbp.model4) # Is alcohol associated with sbp?
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi + 
##     factor(educ) + factor(smk) + factor(alc), data = nhanes, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.495  -9.580  -1.311   7.928 101.235 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   82.33776    1.41399  58.231  < 2e-16 ***
## bpb            0.24874    0.07705   3.228 0.001253 ** 
## age            0.61361    0.01276  48.106  < 2e-16 ***
## factor(race)2  2.00975    0.50657   3.967 7.37e-05 ***
## factor(sex)2  -3.87208    0.49390  -7.840 5.50e-15 ***
## bmi            0.55795    0.03860  14.454  < 2e-16 ***
## factor(educ)2  0.02761    0.49227   0.056 0.955279    
## factor(educ)3 -2.71562    0.72412  -3.750 0.000179 ***
## factor(smk)2  -2.07944    0.56937  -3.652 0.000263 ***
## factor(smk)3  -0.20883    0.57141  -0.365 0.714779    
## factor(alc)1   0.41798    0.49359   0.847 0.397144    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 4891 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.3967, Adjusted R-squared:  0.3954 
## F-statistic: 321.6 on 10 and 4891 DF,  p-value: < 2.2e-16
summary(sbp.model4)$coef #  Pull out relevant information from the output: Coefficient table
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)   82.3377552 1.41399277 58.23067609 0.000000e+00
## bpb            0.2487418 0.07704799  3.22840133 1.253061e-03
## age            0.6136075 0.01275532 48.10601736 0.000000e+00
## factor(race)2  2.0097527 0.50656503  3.96741297 7.369686e-05
## factor(sex)2  -3.8720755 0.49390486 -7.83971931 5.502517e-15
## bmi            0.5579520 0.03860066 14.45446563 2.100456e-46
## factor(educ)2  0.0276076 0.49227488  0.05608169 9.552790e-01
## factor(educ)3 -2.7156205 0.72411551 -3.75025867 1.786946e-04
## factor(smk)2  -2.0794440 0.56937030 -3.65218209 2.627418e-04
## factor(smk)3  -0.2088304 0.57140734 -0.36546680 7.147788e-01
## factor(alc)1   0.4179774 0.49359253  0.84680655 3.971444e-01
summary(sbp.model4)$coef[2, 1] # Pull out just the beta coefficient blood Pb
## [1] 0.2487418
# 10% guideline for confounders: Does the addition of the new variable change the beta coefficient of interest by >10%?
(summary(sbp.model4)$coef[2, 1] - summary(sbp.model3)$coef[2, 1]) / summary(sbp.model3)$coef[2, 1] # Calculate the percent change in the blood Pb coefficient before and after alcohol in the model
## [1] -0.1044827
# Does alcohol meet the guideline for a confounder?

Reporting regression results for untransformed linear data

## Compute difference in SBP and 95% CI per one unit increase and IQR increase in bpb
# For one unit increase in exposure
summary(sbp.model4)$coef[2, 1] # Beta coefficient
## [1] 0.2487418
summary(sbp.model4)$coef[2, 1] - 1.96 * summary(sbp.model4)$coef[2, 2] # Lower confidence interval
## [1] 0.09772778
summary(sbp.model4)$coef[2, 1] + 1.96 * summary(sbp.model4)$coef[2, 2] # Upper confidence interval
## [1] 0.3997559
regress.display(sbp.model4) # Alternative: Convenience function to show all of the CI with the beta coefficients
##  
##                    Coeff  lower095ci upper095ci       Pr>|t|
## (Intercept)   82.3377552 79.56569429 85.1098161 0.000000e+00
## bpb            0.2487418  0.09769317  0.3997905 1.253061e-03
## age            0.6136075  0.58860131  0.6386136 0.000000e+00
## factor(race)2  2.0097527  1.01665770  3.0028476 7.369686e-05
## factor(sex)2  -3.8720755 -4.84035080 -2.9038001 5.502517e-15
## bmi            0.5579520  0.48227733  0.6336266 2.100456e-46
## factor(educ)2  0.0276076 -0.93747225  0.9926875 9.552790e-01
## factor(educ)3 -2.7156205 -4.13521210 -1.2960289 1.786946e-04
## factor(smk)2  -2.0794440 -3.19566554 -0.9632225 2.627418e-04
## factor(smk)3  -0.2088304 -1.32904543  0.9113846 7.147788e-01
## factor(alc)1   0.4179774 -0.54968566  1.3856404 3.971444e-01
# To improve communication of findings, report association for an IQR increase in exposure 
IQR(nhanes$bpb)
## [1] 3.1
IQR(nhanes$bpb) * summary(sbp.model4)$coef[2, 1] # Per IQR increase in exposure, beta coefficient 
## [1] 0.7710997
l95ci.iqr <- IQR(nhanes$bpb) * (summary(sbp.model4)$coef[2, 1] - 1.96 * summary(sbp.model4)$coef[2, 2]) # Per IQR increase in exposure, lower confidence interval
u95ci.iqr <- IQR(nhanes$bpb) * (summary(sbp.model4)$coef[2, 1] + 1.96 * summary(sbp.model4)$coef[2, 2]) # Per IQR increase in exposure, upper confidence interval
regress.display(sbp.model4)$table[2, ] * IQR(nhanes$bpb) # Multiply the convenience table by the IQR
##      Coeff lower095ci upper095ci     Pr>|t| 
## 0.77109973 0.30284884 1.23935061 0.00388449

Compare regression results

## Does the variable packyrs give a better estimate than smk?
sbp.model5 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + packyrs + factor(alc), na.action = na.omit, data = nhanes)
summary(sbp.model5)
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi + 
##     factor(educ) + packyrs + factor(alc), data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.679  -9.666  -1.233   8.026 101.502 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   82.04513    1.42080  57.746  < 2e-16 ***
## bpb            0.24963    0.07906   3.157 0.001602 ** 
## age            0.61464    0.01317  46.658  < 2e-16 ***
## factor(race)2  2.22727    0.51892   4.292 1.81e-05 ***
## factor(sex)2  -3.70826    0.50585  -7.331 2.68e-13 ***
## bmi            0.55073    0.03930  14.012  < 2e-16 ***
## factor(educ)2  0.04590    0.50476   0.091 0.927547    
## factor(educ)3 -2.60357    0.73817  -3.527 0.000424 ***
## packyrs       -0.02462    0.01097  -2.244 0.024860 *  
## factor(alc)1   0.32565    0.49934   0.652 0.514333    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.42 on 4677 degrees of freedom
##   (387 observations deleted due to missingness)
## Multiple R-squared:  0.3952, Adjusted R-squared:  0.394 
## F-statistic: 339.6 on 9 and 4677 DF,  p-value: < 2.2e-16
AIC(sbp.model4)
## [1] 40722.09
AIC(sbp.model5)
## [1] 38957.68
## Which model has the lower AIC? The one with smk (model 4) or the one with packyrs (model 5)? 
## Model 5 also dropped 215 more people. This is an analyst judgment call (no perfect answer). I would probably stick with model 4 since the AIC are close, to keep the people in the study.

## Run two models: One with age and one adding age as a quadratic function.
## Does the quadratic term improve the fit of the model?
sbp.model6 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6) # Is the age squared term significant?
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi + 
##     factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.724  -9.518  -1.413   7.689 101.809 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   87.0142129  1.8737032  46.440  < 2e-16 ***
## bpb            0.2582957  0.0769837   3.355 0.000799 ***
## age            0.3452699  0.0718091   4.808 1.57e-06 ***
## factor(race)2  2.0928803  0.5063451   4.133 3.64e-05 ***
## factor(sex)2  -3.7960138  0.4936354  -7.690 1.77e-14 ***
## bmi            0.5913718  0.0395399  14.956  < 2e-16 ***
## factor(educ)2  0.0989887  0.4919604   0.201 0.840541    
## factor(educ)3 -2.4208295  0.7272801  -3.329 0.000879 ***
## factor(smk)2  -1.8118191  0.5729428  -3.162 0.001575 ** 
## factor(smk)3   0.0969062  0.5762782   0.168 0.866465    
## factor(alc)1   0.4773454  0.4931648   0.968 0.333131    
## I(age^2)       0.0026033  0.0006856   3.797 0.000148 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.36 on 4890 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.3984, Adjusted R-squared:  0.3971 
## F-statistic: 294.4 on 11 and 4890 DF,  p-value: < 2.2e-16
## Use the anova function to compare the 2 models and
## see if the quadratic term improves the model.
anova(sbp.model4, sbp.model6, test = "F")
## Analysis of Variance Table
## 
## Model 1: sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + 
##     factor(smk) + factor(alc)
## Model 2: sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + 
##     factor(smk) + factor(alc) + I(age^2)
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1   4891 1157606                                  
## 2   4890 1154203  1      3403 14.418 0.0001482 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## We'll use sbp.model6 as our final model
## What is the relationship between bpb and sbp?

Regression diagnostics

### Regression Diagnostics
## In the case of linear model, the plot of the model gives diagnostic plots
par(mfrow = c(1, 1))
plot(sbp.model6)

par(mfrow = c(2, 2))
plot(sbp.model6, id.n = 5, cex = 0.1)

# plot.lm(sbp.model6)

par(mfrow = c(1, 1))
plot(sbp.model6, which = 1) # Can ask for the four plots one at a time

plot(sbp.model6, which = 2)

plot(sbp.model6, which = 3)

plot(sbp.model6, which = 5)

plot(sbp.model6, which = 4) # There is an extra, hidden plot you can call out individually

## How about log(sbp)?
## Check how diagnostic plots using log-transformed sbp look like

sbp.model6.log <- lm(log(sbp) ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6.log)
## 
## Call:
## lm(formula = log(sbp) ~ bpb + age + factor(race) + factor(sex) + 
##     bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), 
##     data = nhanes, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53660 -0.07304 -0.00487  0.06676  0.57499 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.505e+00  1.401e-02 321.638  < 2e-16 ***
## bpb            1.974e-03  5.755e-04   3.430 0.000609 ***
## age            3.281e-03  5.368e-04   6.113 1.06e-09 ***
## factor(race)2  1.535e-02  3.785e-03   4.054 5.11e-05 ***
## factor(sex)2  -3.504e-02  3.690e-03  -9.496  < 2e-16 ***
## bmi            4.864e-03  2.956e-04  16.455  < 2e-16 ***
## factor(educ)2  1.928e-03  3.678e-03   0.524 0.600129    
## factor(educ)3 -1.837e-02  5.437e-03  -3.379 0.000734 ***
## factor(smk)2  -1.415e-02  4.283e-03  -3.303 0.000964 ***
## factor(smk)3   5.033e-04  4.308e-03   0.117 0.906992    
## factor(alc)1   5.098e-03  3.687e-03   1.383 0.166742    
## I(age^2)       1.389e-05  5.125e-06   2.710 0.006758 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1148 on 4890 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.4159, Adjusted R-squared:  0.4146 
## F-statistic: 316.5 on 11 and 4890 DF,  p-value: < 2.2e-16
plot(sbp.model6.log)

hist(residuals(sbp.model6.log), nclass = 20)

par(mfrow = c(2, 2))
plot(sbp.model6, which = 1)
plot(sbp.model6.log, which = 1)
hist(residuals(sbp.model6), main = "Histogram of res(SBP)")
hist(residuals(sbp.model6.log), main = "Histogram of res(log(SBP))")

par(mfrow = c(1, 1))

### Partial Residual Plots
## Plot sbp vs. bpb given that other variables are in the model (adjusted)

## This can be done by 'termplot'
termplot(sbp.model6, partial.resid = TRUE, col.res = "gray30", cex = 0.3, smooth = panel.smooth)

Suppose you decide to go with log transformed SBP.

Compute percent increase in SBP and 95% CI per one unit increase and IQR increase in bpb

what to report if y is log-transformed?

percent increase (difference) vs. absolute increase (difference)?

Compute percent increase in SBP and 95% CI per one unit increase and IQR increase in bpb

summary(sbp.model6.log)$coef
##                    Estimate   Std. Error     t value     Pr(>|t|)
## (Intercept)    4.504957e+00 1.400631e-02 321.6375700 0.000000e+00
## bpb            1.973727e-03 5.754686e-04   3.4297737 6.090800e-04
## age            3.281259e-03 5.367875e-04   6.1127716 1.055187e-09
## factor(race)2  1.534593e-02 3.785033e-03   4.0543714 5.105082e-05
## factor(sex)2  -3.504012e-02 3.690025e-03  -9.4959008 3.321831e-21
## bmi            4.863685e-03 2.955687e-04  16.4553462 2.932179e-59
## factor(educ)2  1.927926e-03 3.677504e-03   0.5242485 6.001295e-01
## factor(educ)3 -1.836873e-02 5.436567e-03  -3.3787365 7.338916e-04
## factor(smk)2  -1.414501e-02 4.282864e-03  -3.3026976 9.644763e-04
## factor(smk)3   5.033236e-04 4.307797e-03   0.1168401 9.069915e-01
## factor(alc)1   5.098266e-03 3.686508e-03   1.3829529 1.667424e-01
## I(age^2)       1.388725e-05 5.125029e-06   2.7096926 6.758042e-03
summary(sbp.model6.log)$coef[2, 1]
## [1] 0.001973727
summary(sbp.model6.log)$coef[2, 2]
## [1] 0.0005754686
exp(summary(sbp.model6.log)$coef[2, 1])
## [1] 1.001976
# exp(sbp.model6.log$coef[2])
100 * (exp(summary(sbp.model6.log)$coef[2, 1]) - 1) # Per one unit increase in blood Pb, percent change in systolic blood pressure
## [1] 0.1975676
100 * (exp(summary(sbp.model6.log)$coef[2, 1] - 1.96 * summary(sbp.model6.log)$coef[2, 2]) - 1) # Lower confidence interval
## [1] 0.08461663
100 * (exp(summary(sbp.model6.log)$coef[2, 1] + 1.96 * summary(sbp.model6.log)$coef[2, 2]) - 1) # Upper confidence interval
## [1] 0.310646
IQR(nhanes$bpb)
## [1] 3.1
100 * (exp(IQR(nhanes$bpb) * summary(sbp.model6.log)$coef[2, 1]) - 1) # Per IQR unit increase in blood Pb, percent change in systolic blood pressure
## [1] 0.613731
100 * (exp(IQR(nhanes$bpb) * (summary(sbp.model6.log)$coef[2, 1] - 1.96 * summary(sbp.model6.log)$coef[2, 2])) - 1) # Lower confidence interval
## [1] 0.2625447
100 * (exp(IQR(nhanes$bpb) * (summary(sbp.model6.log)$coef[2, 1] + 1.96 * summary(sbp.model6.log)$coef[2, 2])) - 1) # Upper confidence interval
## [1] 0.9661474

linearity assumption

Non-linear associations

# What if the relationship between blood lead and sbp is non-linear? 
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## The following object is masked from 'package:nnet':
## 
##     multinom
sbp.model6.gam <- gam(sbp ~ s(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6.gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## sbp ~ s(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + 
##     factor(smk) + factor(alc) + I(age^2)
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   88.2143968  1.8617921  47.381  < 2e-16 ***
## age            0.3363437  0.0718470   4.681 2.93e-06 ***
## factor(race)2  2.0313316  0.5062947   4.012 6.11e-05 ***
## factor(sex)2  -3.5783017  0.5001717  -7.154 9.67e-13 ***
## bmi            0.5931173  0.0395125  15.011  < 2e-16 ***
## factor(educ)2  0.1625592  0.4919648   0.330 0.741090    
## factor(educ)3 -2.2564780  0.7288554  -3.096 0.001973 ** 
## factor(smk)2  -1.8333930  0.5724650  -3.203 0.001371 ** 
## factor(smk)3  -0.0462355  0.5780681  -0.080 0.936254    
## factor(alc)1   0.4435799  0.4929664   0.900 0.368262    
## I(age^2)       0.0026407  0.0006852   3.854 0.000118 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##         edf Ref.df    F  p-value    
## s(bpb) 2.36  3.014 6.61 0.000184 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.398   Deviance explained =   40%
## GCV = 236.21  Scale est. = 235.57    n = 4902
plot(sbp.model6.gam)

plot(sbp.model6.gam, xlab = "Blood lead (ug/dL)", ylab = "Change in log(SBP)") # Does this relationship look linear?

# Option to log transform the exposure
sbp.model7 <- lm(sbp ~ log(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model7)
## 
## Call:
## lm(formula = sbp ~ log(bpb) + age + factor(race) + factor(sex) + 
##     bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), 
##     data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.464  -9.543  -1.339   7.746 103.034 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   86.8708956  1.8722723  46.399  < 2e-16 ***
## log(bpb)       1.4222563  0.3526023   4.034 5.58e-05 ***
## age            0.3316940  0.0719935   4.607 4.18e-06 ***
## factor(race)2  2.0361714  0.5066138   4.019 5.93e-05 ***
## factor(sex)2  -3.5467680  0.5053567  -7.018 2.55e-12 ***
## bmi            0.5902534  0.0394943  14.945  < 2e-16 ***
## factor(educ)2  0.1417779  0.4914805   0.288  0.77300    
## factor(educ)3 -2.2839018  0.7293924  -3.131  0.00175 ** 
## factor(smk)2  -1.8271820  0.5726692  -3.191  0.00143 ** 
## factor(smk)3  -0.0474241  0.5795115  -0.082  0.93478    
## factor(alc)1   0.4312724  0.4933101   0.874  0.38203    
## I(age^2)       0.0026700  0.0006858   3.893  0.00010 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.36 on 4890 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.399,  Adjusted R-squared:  0.3977 
## F-statistic: 295.2 on 11 and 4890 DF,  p-value: < 2.2e-16
summary(sbp.model6)
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi + 
##     factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.724  -9.518  -1.413   7.689 101.809 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   87.0142129  1.8737032  46.440  < 2e-16 ***
## bpb            0.2582957  0.0769837   3.355 0.000799 ***
## age            0.3452699  0.0718091   4.808 1.57e-06 ***
## factor(race)2  2.0928803  0.5063451   4.133 3.64e-05 ***
## factor(sex)2  -3.7960138  0.4936354  -7.690 1.77e-14 ***
## bmi            0.5913718  0.0395399  14.956  < 2e-16 ***
## factor(educ)2  0.0989887  0.4919604   0.201 0.840541    
## factor(educ)3 -2.4208295  0.7272801  -3.329 0.000879 ***
## factor(smk)2  -1.8118191  0.5729428  -3.162 0.001575 ** 
## factor(smk)3   0.0969062  0.5762782   0.168 0.866465    
## factor(alc)1   0.4773454  0.4931648   0.968 0.333131    
## I(age^2)       0.0026033  0.0006856   3.797 0.000148 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.36 on 4890 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.3984, Adjusted R-squared:  0.3971 
## F-statistic: 294.4 on 11 and 4890 DF,  p-value: < 2.2e-16
## compare model6 and model7
AIC(sbp.model6, sbp.model7) # Which model has the lower AIC? The spline term or the log term for blood Pb?
##            df      AIC
## sbp.model6 13 40709.65
## sbp.model7 13 40704.64

Effect modification by sex

# What if the relationship between blood Pb and sbp varies by sex?
table(nhanes$sex)
## 
##    1    2 
## 2335 2739
### Stratified by sex
## Male (sex==1)
sbp.model6.male <- lm(sbp ~ bpb + age + factor(race) + bmi + factor(educ)
  + factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 1))
summary(sbp.model6.male) # What is the beta coefficient for blood Pb in males?
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + bmi + factor(educ) + 
##     factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 
##     1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.951  -9.164  -1.214   7.452  67.763 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   94.0698502  2.7370216  34.369  < 2e-16 ***
## bpb            0.2201841  0.0907801   2.425 0.015367 *  
## age            0.0247105  0.1020797   0.242 0.808748    
## factor(race)2  2.6271948  0.7251217   3.623 0.000298 ***
## bmi            0.7190556  0.0667827  10.767  < 2e-16 ***
## factor(educ)2 -0.1580127  0.6931503  -0.228 0.819696    
## factor(educ)3 -1.9583250  0.9739884  -2.011 0.044485 *  
## factor(smk)2  -0.2674489  0.7894817  -0.339 0.734818    
## factor(smk)3   1.5625824  0.8126909   1.923 0.054641 .  
## factor(alc)1  -0.1923135  0.6709470  -0.287 0.774422    
## I(age^2)       0.0043354  0.0009734   4.454 8.84e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.6 on 2240 degrees of freedom
##   (84 observations deleted due to missingness)
## Multiple R-squared:  0.3134, Adjusted R-squared:  0.3103 
## F-statistic: 102.3 on 10 and 2240 DF,  p-value: < 2.2e-16
## Female (sex==2)
sbp.model6.female <- lm(sbp ~ bpb + age + factor(race) + bmi + factor(educ)
  + factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 2))
summary(sbp.model6.female) # What is the beta coefficient for blood Pb in females? Is it similar to that in males?
## 
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + bmi + factor(educ) + 
##     factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 
##     2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.975  -9.454  -1.348   7.659  99.869 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   77.0605375  2.4865167  30.991  < 2e-16 ***
## bpb            0.2931960  0.1343263   2.183  0.02914 *  
## age            0.5576526  0.0990458   5.630 1.99e-08 ***
## factor(race)2  2.0982646  0.6949672   3.019  0.00256 ** 
## bmi            0.5305908  0.0492797  10.767  < 2e-16 ***
## factor(educ)2  0.3947007  0.6828269   0.578  0.56329    
## factor(educ)3 -2.1735912  1.0641023  -2.043  0.04119 *  
## factor(smk)2  -1.4335988  0.8458868  -1.695  0.09023 .  
## factor(smk)3  -0.5512632  0.8104050  -0.680  0.49642    
## factor(alc)1   0.6533917  0.7063162   0.925  0.35501    
## I(age^2)       0.0016206  0.0009452   1.715  0.08655 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.66 on 2640 degrees of freedom
##   (88 observations deleted due to missingness)
## Multiple R-squared:  0.4611, Adjusted R-squared:  0.4591 
## F-statistic: 225.9 on 10 and 2640 DF,  p-value: < 2.2e-16
### Use interaction model
sbp.model6.int <- lm(sbp ~ bpb + factor(sex) + bpb * factor(sex) + age + factor(race) + bmi + factor(educ)
  + factor(smk) + factor(alc) + I(age^2), data = nhanes)
# below is the same
sbp.model6.int <- lm(sbp ~ bpb * factor(sex) + age + factor(race) + bmi + factor(educ)
  + factor(smk) + factor(alc) + I(age^2), data = nhanes)
summary(sbp.model6.int) # Is the interaction term significant?
## 
## Call:
## lm(formula = sbp ~ bpb * factor(sex) + age + factor(race) + bmi + 
##     factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.894  -9.530  -1.369   7.642 102.461 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      87.8854794  1.9054676  46.123  < 2e-16 ***
## bpb               0.1317420  0.0923617   1.426 0.153825    
## factor(sex)2     -5.1875500  0.7476820  -6.938 4.49e-12 ***
## age               0.3385020  0.0718234   4.713 2.51e-06 ***
## factor(race)2     2.0885416  0.5060825   4.127 3.74e-05 ***
## bmi               0.5905818  0.0395204  14.944  < 2e-16 ***
## factor(educ)2     0.1078127  0.4917151   0.219 0.826458    
## factor(educ)3    -2.4434079  0.7269556  -3.361 0.000782 ***
## factor(smk)2     -1.7832085  0.5727587  -3.113 0.001860 ** 
## factor(smk)3      0.1024602  0.5759802   0.178 0.858818    
## factor(alc)1      0.4787710  0.4929064   0.971 0.331436    
## I(age^2)          0.0026405  0.0006854   3.853 0.000118 ***
## bpb:factor(sex)2  0.3796814  0.1532848   2.477 0.013284 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.36 on 4889 degrees of freedom
##   (172 observations deleted due to missingness)
## Multiple R-squared:  0.3992, Adjusted R-squared:  0.3977 
## F-statistic: 270.7 on 12 and 4889 DF,  p-value: < 2.2e-16

Exercise 6A

This is your last homework assignment for the semester.

6.3. Generalized Linear Models: Association between hypertension (htn) and blood lead (bpb)

## Logistic regression for hypertension
## Look at hypertension status (htn)
tab1(nhanes$htn, graph = F)
## nhanes$htn : 
##         Frequency Percent Cum. percent
## 0            3135    61.8         61.8
## 1            1939    38.2        100.0
##   Total      5074   100.0        100.0
htn.model <- glm(htn ~ bpb + age + factor(sex) + factor(race) + bmi + factor(educ) + factor(smk) + factor(alc),
  family = binomial,
  na.action = na.omit, data = nhanes)
summary(htn.model)
## 
## Call:
## glm(formula = htn ~ bpb + age + factor(sex) + factor(race) + 
##     bmi + factor(educ) + factor(smk) + factor(alc), family = binomial, 
##     data = nhanes, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4998  -0.7949  -0.4272   0.8959   2.5147  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -6.635396   0.261535 -25.371  < 2e-16 ***
## bpb            0.023959   0.011750   2.039   0.0414 *  
## age            0.061698   0.002235  27.602  < 2e-16 ***
## factor(sex)2   0.063134   0.077426   0.815   0.4148    
## factor(race)2  0.494599   0.080020   6.181 6.37e-10 ***
## bmi            0.097071   0.006285  15.446  < 2e-16 ***
## factor(educ)2  0.075786   0.076618   0.989   0.3226    
## factor(educ)3 -0.085220   0.114171  -0.746   0.4554    
## factor(smk)2  -0.037067   0.086530  -0.428   0.6684    
## factor(smk)3   0.080902   0.091358   0.886   0.3759    
## factor(alc)1   0.008554   0.077201   0.111   0.9118    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6500.8  on 4901  degrees of freedom
## Residual deviance: 5080.9  on 4891  degrees of freedom
##   (172 observations deleted due to missingness)
## AIC: 5102.9
## 
## Number of Fisher Scoring iterations: 4
# Compute ORs
logistic.display(htn.model)
##  
##                      OR lower95ci upper95ci      Pr(>|Z|)
## bpb           1.0242485 1.0009304  1.048110  4.143898e-02
## age           1.0636409 1.0589913  1.068311 1.040118e-167
## factor(sex)2  1.0651698 0.9151960  1.239720  4.148329e-01
## factor(race)2 1.6398400 1.4018069  1.918292  6.372658e-10
## bmi           1.1019384 1.0884485  1.115596  8.036933e-54
## factor(educ)2 1.0787320 0.9283169  1.253519  3.225922e-01
## factor(educ)3 0.9183104 0.7341876  1.148609  4.554123e-01
## factor(smk)2  0.9636111 0.8132928  1.141712  6.683781e-01
## factor(smk)3  1.0842650 0.9065078  1.296879  3.758574e-01
## factor(alc)1  1.0085903 0.8669650  1.173351  9.117772e-01
# Regression diagnostics
par(mfrow = c(2, 2))
plot(htn.model)

par(mfrow = c(1, 1))
termplot(htn.model, se = T, partial.resid = T)

Stepwise variable selection

Poisson Regression for count outcomes.

Code is available, but we will skip going through this during class

# ## Poisson regression for respiratory death: Montana dataset from epiDisplay
# data(Montana)
# summ(Montana)
# head(Montana, 10)
# hist(Montana$respdeath)
# 
# par(mfrow = c(2, 2))
# tab1(Montana$agegr)
# tab1(Montana$period)
# tab1(Montana$start)
# tab1(Montana$arsenic)
# 
# ## label categorical variables
# Montana$agegr <- factor(Montana$agegr, labels = c("40-49", "50-59", "60-69", "70-79"))
# Montana$period <- factor(Montana$period, labels = c("1938-1949", "1950-1959", "1960-1969", "1970-1977"))
# Montana$start <- factor(Montana$start, labels = c("pre-1925", "1925 & after"))
# Montana$arsenic <- factor(Montana$arsenic, labels = c("<1 year", "1-4 years", "5-14 years", "15+ years"))
# 
# tab1(Montana$agegr, missing = F)
# tab1(Montana$period, missing = F)
# tab1(Montana$start, missing = F)
# tab1(Montana$arsenic, missing = F)
# par(mfrow = c(1, 1))
# 
# ## Compute incidence rate by age and period
# table.pyears <- tapply(Montana$personyrs, list(Montana$period, Montana$agegr), sum)
# table.deaths <- tapply(Montana$respdeath, list(Montana$period, Montana$agegr), sum)
# table.inc10000 <- table.deaths / table.pyears * 10000
# table.inc10000
# 
# ## create a time-series plot of the incidence
# plot.ts(table.inc10000, plot.type = "single", xlab = "", ylab = "#/10,000 person-years", xaxt = "n", col = c("black", "blue", "red", "green"), lty = c(2, 1, 1, 2), las = 1)
# points(rep(1:4, 4), table.inc10000, pch = 22, cex = table.pyears / sum(table.pyears) * 20)
# title(main = "Incidence by age and period")
# axis(side = 1, at = 1:4, labels = levels(Montana$period))
# legend("topleft", legend = levels(Montana$agegr)[4:1], col = c("green", "red", "blue", "black"), bg = "white", lty = c(2, 1, 1, 2))
# 
# ## check arsenic
# tab1(Montana$arsenic)
# tapply(Montana$respdeath, Montana$arsenic, mean)
# tapply(Montana$personyrs, Montana$arsenic, mean)
# 
# ## Poisson model
# resp.mode11 <- glm(respdeath ~ period, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode11)
# 
# resp.mode12 <- glm(respdeath ~ agegr, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode12)
# 
# resp.mode13 <- glm(respdeath ~ period + agegr, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode13)
# 
# AIC(resp.mode11, resp.mode12, resp.mode13)
# ## model2 is better
# 
# resp.mode14 <- glm(respdeath ~ agegr + arsenic, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode14)
# 
# ## is there a linear trend across arsenic exposure?
# resp.mode14.lin <- glm(respdeath ~ agegr + as.numeric(arsenic), offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode14.lin)
# 
# ## compute IRR
# idr.display(resp.mode14)

Optional: Exercise 6B

Not a graded assignment. Optional.

6.5. Matched Case-Control Study: VC1to1 from epiDisplay

# Matched case-control dataset available in the epiDisplay package (packages can contain practice data as well as functions)
data(VC1to1)
summ(VC1to1) # What is the shape/structure of this dataset?
## 
## No. of observations = 52
## 
##   Var. name obs. mean   median  s.d.   min.   max.  
## 1 matset    52   13.5   13.5    7.57   1      26    
## 2 case      52   0.5    0.5     0.5    0      1     
## 3 smoking   52   0.81   1       0.4    0      1     
## 4 rubber    52   0.33   0       0.47   0      1     
## 5 alcohol   52   0.52   1       0.5    0      1
head(VC1to1) # 1 case matched to 1 control
##   matset case smoking rubber alcohol
## 1      1    1       1      0       0
## 2      1    0       1      0       0
## 3      2    1       1      0       1
## 4      2    0       1      1       0
## 5      3    1       1      1       0
## 6      3    0       1      1       0
# Reshape the data to facilitate data exploration
# function 'reshape' converts wide to long or vice versa
wide <- reshape(VC1to1, timevar = "case", v.names = c("smoking", "rubber", "alcohol"), idvar = "matset", direction = "wide")
head(wide, 3)
##   matset smoking.1 rubber.1 alcohol.1 smoking.0 rubber.0 alcohol.0
## 1      1         1        0         0         1        0         0
## 3      2         1        0         1         1        1         0
## 5      3         1        1         0         1        1         0
table(wide$smoking.1, wide$smoking.0, dnn = c("smoking in case", "smoking in control"))
##                smoking in control
## smoking in case  0  1
##               0  0  5
##               1  5 16
# dnn: dimnames names

# matchTab() is for the conditional OR (McNemar's OR)
matchTab(VC1to1$case, VC1to1$smoking, strata = VC1to1$matset) # Is smoking exposure associated with cancer?
## 
## Exposure status: $ = 1 
##  
##  Exposure status: VC1to1 = 1 
##  
##  Exposure status: smoking = 1 
##  
## Total number of match sets in the tabulation = 26 
##  
## Number of controls = 1 
##                     No. of controls exposed
## No. of cases exposed  0  1
##                    0  0  5
##                    1  5 16
## 
## Odds ratio by Mantel-Haenszel method = 1 
##  
## Odds ratio by maximum likelihood estimate (MLE) method = 1 
##  95%CI= 0.29 , 3.454 
## 
matchTab(VC1to1$case, VC1to1$rubber, strata = VC1to1$matset) # Is working in the rubber industry associated with cancer?
## 
## Exposure status: $ = 1 
##  
##  Exposure status: VC1to1 = 1 
##  
##  Exposure status: rubber = 1 
##  
## Total number of match sets in the tabulation = 26 
##  
## Number of controls = 1 
##                     No. of controls exposed
## No. of cases exposed  0  1
##                    0 13  5
##                    1  4  4
## 
## Odds ratio by Mantel-Haenszel method = 0.8 
##  
## Odds ratio by maximum likelihood estimate (MLE) method = 0.8 
##  95%CI= 0.215 , 2.979 
## 
matchTab(VC1to1$case, VC1to1$alcohol, strata = VC1to1$matset) # Is alcohol consumption associated iwth cancer?
## 
## Exposure status: $ = 1 
##  
##  Exposure status: VC1to1 = 1 
##  
##  Exposure status: alcohol = 1 
##  
## Total number of match sets in the tabulation = 26 
##  
## Number of controls = 1 
##                     No. of controls exposed
## No. of cases exposed 0 1
##                    0 7 2
##                    1 9 8
## 
## Odds ratio by Mantel-Haenszel method = 4.5 
##  
## Odds ratio by maximum likelihood estimate (MLE) method = 4.5 
##  95%CI= 0.972 , 20.827 
## 
## look at the full dataset VC1to6
data(VC1to6) # 1 case matched to up to 6 controls
summ(VC1to6)
## 
## No. of observations = 119
## 
##   Var. name obs. mean   median  s.d.   min.   max.  
## 1 matset    119  15.75  17      6.96   1      26    
## 2 case      119  0.22   0       0.41   0      1     
## 3 smoking   119  0.7    1       0.46   0      1     
## 4 rubber    119  0.37   0       0.48   0      1     
## 5 alcohol   119  0.42   0       0.5    0      1
VC1to6[, ]
##     matset case smoking rubber alcohol
## 1        1    1       1      0       0
## 2        1    0       1      0       0
## 3        2    1       1      0       1
## 4        2    0       1      1       0
## 5        3    1       1      1       0
## 6        3    0       1      1       0
## 7        4    1       1      0       0
## 8        4    0       1      1       1
## 9        4    0       0      1       1
## 10       5    1       0      0       1
## 11       5    0       1      0       0
## 12       5    0       1      0       0
## 13       6    1       1      0       1
## 14       6    0       0      0       0
## 15       6    0       0      0       0
## 16       7    1       1      0       1
## 17       7    0       1      0       0
## 18       7    0       1      0       1
## 19       7    0       1      1       0
## 20       8    1       1      0       0
## 21       8    0       1      0       0
## 22       8    0       1      0       1
## 23       8    0       0      1       0
## 24       9    1       1      1       1
## 25       9    0       1      1       0
## 26       9    0       1      1       0
## 27       9    0       0      1       0
## 28      10    1       0      0       0
## 29      10    0       1      1       0
## 30      10    0       0      0       0
## 31      10    0       1      0       0
## 32      11    1       1      0       1
## 33      11    0       1      0       1
## 34      11    0       0      0       0
## 35      11    0       1      0       1
## 36      12    1       1      0       0
## 37      12    0       1      0       1
## 38      12    0       1      0       0
## 39      12    0       0      0       0
## 40      13    1       1      1       0
## 41      13    0       0      0       0
## 42      13    0       0      0       0
## 43      13    0       0      0       0
## 44      14    1       1      0       1
## 45      14    0       1      0       1
## 46      14    0       0      0       0
## 47      14    0       1      1       0
## 48      14    0       1      0       1
## 49      15    1       1      0       1
## 50      15    0       1      0       0
## 51      15    0       0      0       0
## 52      15    0       1      0       0
## 53      15    0       1      0       1
## 54      16    1       1      0       1
## 55      16    0       0      0       1
## 56      16    0       1      0       1
## 57      16    0       1      0       1
## 58      16    0       1      1       0
## 59      17    1       1      1       1
## 60      17    0       1      0       1
## 61      17    0       1      1       1
## 62      17    0       1      0       0
## 63      17    0       0      1       0
## 64      18    1       1      0       1
## 65      18    0       1      0       1
## 66      18    0       0      1       0
## 67      18    0       0      1       0
## 68      18    0       1      0       0
## 69      18    0       1      0       1
## 70      19    1       0      0       0
## 71      19    0       1      0       0
## 72      19    0       0      0       0
## 73      19    0       0      0       0
## 74      19    0       0      1       0
## 75      19    0       0      0       0
## 76      20    1       1      1       1
## 77      20    0       0      0       0
## 78      20    0       0      0       0
## 79      20    0       0      0       0
## 80      20    0       1      0       0
## 81      20    0       0      0       0
## 82      21    1       1      1       1
## 83      21    0       1      1       1
## 84      21    0       1      1       1
## 85      21    0       1      0       1
## 86      21    0       1      1       1
## 87      21    0       1      1       1
## 88      21    0       1      1       1
## 89      22    1       0      0       1
## 90      22    0       1      1       1
## 91      22    0       0      0       1
## 92      22    0       1      1       0
## 93      22    0       0      0       0
## 94      22    0       0      1       0
## 95      22    0       1      0       0
## 96      23    1       1      1       1
## 97      23    0       1      1       1
## 98      23    0       1      1       1
## 99      23    0       1      1       0
## 100     23    0       1      1       1
## 101     23    0       1      1       0
## 102     23    0       1      1       0
## 103     24    1       1      1       1
## 104     24    0       1      0       0
## 105     24    0       1      0       0
## 106     24    0       0      1       1
## 107     24    0       1      0       0
## 108     24    0       1      0       1
## 109     24    0       1      0       0
## 110     25    1       0      0       0
## 111     25    0       1      1       0
## 112     25    0       1      1       1
## 113     25    0       1      0       1
## 114     25    0       1      0       0
## 115     26    1       1      0       1
## 116     26    0       0      0       0
## 117     26    0       1      1       0
## 118     26    0       0      0       0
## 119     26    0       1      1       1
# what is the effect of smoking?
matchTab(VC1to6$case, VC1to6$smoking, strata = VC1to6$matset) # Is smoking exposure associated with cancer?
## 
## Exposure status: $ = 1 
##  
##  Exposure status: VC1to6 = 1 
##  
##  Exposure status: smoking = 1 
##  
## Total number of match sets in the tabulation = 26 
##  
## Number of controls = 1 
##                     No. of controls exposed
## No. of cases exposed 0 1
##                    0 0 0
##                    1 0 3
## 
## Number of controls = 2 
##                     No. of controls exposed
## No. of cases exposed 0 1 2
##                    0 0 0 1
##                    1 1 1 0
## 
## Number of controls = 3 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3
##                    0 0 0 1 0
##                    1 1 0 4 1
## 
## Number of controls = 4 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3 4
##                    0 0 0 0 0 1
##                    1 0 0 1 4 0
## 
## Number of controls = 5 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5
##                    0 0 1 0 0 0 0
##                    1 0 1 0 1 0 0
## 
## Number of controls = 6 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5 6
##                    0 0 0 0 1 0 0 0
##                    1 0 0 0 0 0 1 2
## 
## Odds ratio by Mantel-Haenszel method = 1.988 
##  
## Odds ratio by maximum likelihood estimate (MLE) method = 2.066 
##  95%CI= 0.678 , 6.301 
## 
matchTab(VC1to6$case, VC1to6$alcohol, strata = VC1to6$matset)
## 
## Exposure status: $ = 1 
##  
##  Exposure status: VC1to6 = 1 
##  
##  Exposure status: alcohol = 1 
##  
## Total number of match sets in the tabulation = 26 
##  
## Number of controls = 1 
##                     No. of controls exposed
## No. of cases exposed 0 1
##                    0 2 0
##                    1 1 0
## 
## Number of controls = 2 
##                     No. of controls exposed
## No. of cases exposed 0 1 2
##                    0 0 0 1
##                    1 2 0 0
## 
## Number of controls = 3 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3
##                    0 2 2 0 0
##                    1 1 1 1 0
## 
## Number of controls = 4 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3 4
##                    0 0 0 1 0 0
##                    1 0 2 2 1 0
## 
## Number of controls = 5 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5
##                    0 1 0 0 0 0 0
##                    1 1 0 1 0 0 0
## 
## Number of controls = 6 
##                     No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5 6
##                    0 0 0 0 0 0 0 0
##                    1 0 0 2 1 0 0 1
## 
## Odds ratio by Mantel-Haenszel method = 5.386 
##  
## Odds ratio by maximum likelihood estimate (MLE) method = 5.655 
##  95%CI= 1.811 , 17.659 
## 

Conditional logistic regression

# conditional logistic reg using clogit from survival package
### load the survival package
library(survival)
clogit1 <- clogit(case ~ smoking + alcohol + strata(matset), data = VC1to1) # 1 to 1 match dataset
summary(clogit1) # What covariates are associated?
## Call:
## coxph(formula = Surv(rep(1, 52L), case) ~ smoking + alcohol + 
##     strata(matset), data = VC1to1, method = "exact")
## 
##   n= 52, number of events= 26 
## 
##            coef exp(coef) se(coef)      z Pr(>|z|)  
## smoking -0.3142    0.7304   0.7079 -0.444   0.6572  
## alcohol  1.5715    4.8141   0.8030  1.957   0.0503 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## smoking    0.7304     1.3691    0.1824     2.925
## alcohol    4.8141     0.2077    0.9978    23.227
## 
## Concordance= 0.673  (se = 0.101 )
## Likelihood ratio test= 5.02  on 2 df,   p=0.08
## Wald test            = 3.83  on 2 df,   p=0.1
## Score (logrank) test = 4.62  on 2 df,   p=0.1
clogit2 <- clogit(case ~ smoking + alcohol + strata(matset), data = VC1to6) # 1 to 6 match dataset
summary(clogit2) # What covariates are associated?
## Call:
## coxph(formula = Surv(rep(1, 119L), case) ~ smoking + alcohol + 
##     strata(matset), data = VC1to6, method = "exact")
## 
##   n= 119, number of events= 26 
## 
##           coef exp(coef) se(coef)     z Pr(>|z|)   
## smoking 0.3619    1.4361   0.6244 0.580  0.56217   
## alcohol 1.6552    5.2342   0.5899 2.806  0.00502 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## smoking     1.436     0.6963    0.4224     4.883
## alcohol     5.234     0.1911    1.6471    16.633
## 
## Concordance= 0.694  (se = 0.065 )
## Likelihood ratio test= 11.49  on 2 df,   p=0.003
## Wald test            = 9.13  on 2 df,   p=0.01
## Score (logrank) test = 11.04  on 2 df,   p=0.004
# compute ORs
clogistic.display(clogit1)
## Conditional logistic regression predicting case : 1 vs 0 
## 
##  
##                  crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
## smoking: 1 vs 0  1 (0.29,3.45)     0.73 (0.18,2.92)  0.657          0.655     
##                                                                               
## alcohol: 1 vs 0  4.5 (0.97,20.83)  4.81 (1,23.23)    0.05           0.025     
##                                                                               
## No. of observations =  52
clogistic.display(clogit2)
## Conditional logistic regression predicting case : 1 vs 0 
## 
##  
##                  crude OR(95%CI)    adj. OR(95%CI)     P(Wald's test)
## smoking: 1 vs 0  2.07 (0.68,6.3)    1.44 (0.42,4.88)   0.562         
##                                                                      
## alcohol: 1 vs 0  5.66 (1.81,17.66)  5.23 (1.65,16.63)  0.005         
##                                                                      
##                  P(LR-test)
## smoking: 1 vs 0  0.559     
##                            
## alcohol: 1 vs 0  0.002     
##                            
## No. of observations =  119

6.6. Survival Analysis: Association between total mortality (d_total) and blood lead (bpb)

tab1(nhanes$d_total) # Variable for death due to any cause in NHANES

## nhanes$d_total : 
##         Frequency   %(NA+)   %(NA-)
## 0            3794     74.8     74.8
## 1            1275     25.1     25.2
## NaN             5      0.1      0.0
##   Total      5074    100.0    100.0
summ(nhanes$pmon_mec) # Number of months of follow up 

##  obs. mean   median  s.d.   min.   max.  
##  5069 158.7  170     49.481 0      217
### Define Surv() object to use in later functions
surv.total <- Surv(nhanes$pmon_mec, nhanes$d_total)
surv.total[1:10] # Includes information on time of follow up and whether the person died, denoted with a "+"
##  [1] 203+ 196+ 215+ 132  184+ 143  202+ 206+ 187+ 115
### K-M Life table and curve
fit.total <- survfit(Surv(nhanes$pmon_mec, nhanes$d_total) ~ 1) # Model with no predictors, just outcome
#summary(fit.total)
summary(nhanes$pmon_mec/12) # How many years of follow up do you have?
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   12.42   14.17   13.22   16.08   18.08       5
fit.total # How many people died over this time period?
## Call: survfit(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ 1)
## 
##    5 observations deleted due to missingness 
##       n  events  median 0.95LCL 0.95UCL 
##    5069    1275      NA      NA      NA
plot(fit.total)

## suppress 95% CI lines and the time marks for censored subjects.
plot(fit.total, ylim = c(0.7, 1.0), conf.int = F, mark.time = F, ylab="Probability of survival", xlab="Months")

### Survival by different levels of covariates
fit.total.sex <- survfit(Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex)
fit.total.sex # How many people died in each sex group?
## Call: survfit(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex)
## 
##    5 observations deleted due to missingness 
##                 n events median 0.95LCL 0.95UCL
## nhanes$sex=1 2332    679     NA      NA      NA
## nhanes$sex=2 2737    596     NA      NA      NA
#summary(fit.total.sex)[1:10]

plot(fit.total.sex, ylim = c(0.6, 1.0), col = c("blue", "red"), lty = c(1, 2), mark.time = F, main = "Kaplan-Meier curve", xlab = "Time (months)", ylab = "Survival probability")
legend("bottomleft", legend = c("Men", "Women"), col = c("blue", "red"),  lty = c(1, 2))

### Test for differences in survival curves
survdiff(Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex) # Is there a difference in survival among males and females?
## Call:
## survdiff(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex)
## 
## n=5069, 5 observations deleted due to missingness.
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## nhanes$sex=1 2332      679      573      19.7      35.8
## nhanes$sex=2 2737      596      702      16.1      35.8
## 
##  Chisq= 35.8  on 1 degrees of freedom, p= 2e-09

Cox regression

cox.bpb <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$bpb)
summary(cox.bpb) # Is blood Pb associated with survival? Easier to visualize in categories of exposure
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$bpb)
## 
##   n= 5069, number of events= 1275 
##    (5 observations deleted due to missingness)
## 
##                coef exp(coef) se(coef)     z Pr(>|z|)    
## nhanes$bpb 0.070662  1.073219 0.005597 12.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## nhanes$bpb     1.073     0.9318     1.062     1.085
## 
## Concordance= 0.62  (se = 0.008 )
## Likelihood ratio test= 113.3  on 1 df,   p=<2e-16
## Wald test            = 159.4  on 1 df,   p=<2e-16
## Score (logrank) test = 153.9  on 1 df,   p=<2e-16
bpb3 <- cut2(nhanes$bpb, g = 3)
tab1(bpb3) # Tertiles of blood Pb

## bpb3 : 
##            Frequency Percent Cum. percent
## [0.7, 2.4)      1760    34.7         34.7
## [2.4, 4.4)      1672    33.0         67.6
## [4.4,48.1]      1642    32.4        100.0
##   Total         5074   100.0        100.0
# K-M Life table and curve
fit.total.bpb3 <- survfit(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
fit.total.bpb3
## Call: survfit(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
## 
##    5 observations deleted due to missingness 
##                    n events median 0.95LCL 0.95UCL
## bpb3=[0.7, 2.4) 1758    248     NA      NA      NA
## bpb3=[2.4, 4.4) 1670    451     NA      NA      NA
## bpb3=[4.4,48.1] 1641    576     NA      NA      NA
plot(fit.total.bpb3, col = c(1:3), lty = c(1:3), ylim = c(0.6, 1.0), main = "Survival in relation to blood lead levels", xlab = "Time (months)", ylab = "Survival probability") 
legend(30, 0.7, legend = c("Q1", "Q2", "Q3"), lty = c(1:3), col = c(1:3))

# crude
cox.bpb3 <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
summary(cox.bpb3)
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
## 
##   n= 5069, number of events= 1275 
##    (5 observations deleted due to missingness)
## 
##                   coef exp(coef) se(coef)      z Pr(>|z|)    
## bpb3[2.4, 4.4) 0.70388   2.02158  0.07906  8.903   <2e-16 ***
## bpb3[4.4,48.1] 1.00452   2.73061  0.07599 13.219   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## bpb3[2.4, 4.4)     2.022     0.4947     1.731     2.360
## bpb3[4.4,48.1]     2.731     0.3662     2.353     3.169
## 
## Concordance= 0.605  (se = 0.007 )
## Likelihood ratio test= 196.5  on 2 df,   p=<2e-16
## Wald test            = 174.7  on 2 df,   p=<2e-16
## Score (logrank) test = 186.5  on 2 df,   p=<2e-16
# adjusted
cox.bpb3.adj <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 + nhanes$age + factor(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) + factor(nhanes$smk) + factor(nhanes$alc))
summary(cox.bpb3.adj)
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 + 
##     nhanes$age + factor(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) + 
##     factor(nhanes$smk) + factor(nhanes$alc))
## 
##   n= 4905, number of events= 1211 
##    (169 observations deleted due to missingness)
## 
##                           coef exp(coef)  se(coef)      z Pr(>|z|)    
## bpb3[2.4, 4.4)        0.069004  1.071440  0.081665  0.845  0.39813    
## bpb3[4.4,48.1]        0.084549  1.088226  0.083037  1.018  0.30858    
## nhanes$age            0.088495  1.092529  0.002373 37.287  < 2e-16 ***
## factor(nhanes$sex)2  -0.294962  0.744560  0.065259 -4.520 6.19e-06 ***
## factor(nhanes$race)2  0.178596  1.195537  0.070922  2.518  0.01180 *  
## factor(nhanes$educ)2  0.001651  1.001652  0.062588  0.026  0.97895    
## factor(nhanes$educ)3 -0.205472  0.814263  0.102174 -2.011  0.04432 *  
## factor(nhanes$smk)2   0.171628  1.187237  0.070372  2.439  0.01473 *  
## factor(nhanes$smk)3   0.631970  1.881314  0.082193  7.689 1.48e-14 ***
## factor(nhanes$alc)1  -0.182403  0.833265  0.067767 -2.692  0.00711 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                      exp(coef) exp(-coef) lower .95 upper .95
## bpb3[2.4, 4.4)          1.0714     0.9333    0.9130    1.2574
## bpb3[4.4,48.1]          1.0882     0.9189    0.9248    1.2806
## nhanes$age              1.0925     0.9153    1.0875    1.0976
## factor(nhanes$sex)2     0.7446     1.3431    0.6552    0.8462
## factor(nhanes$race)2    1.1955     0.8364    1.0404    1.3738
## factor(nhanes$educ)2    1.0017     0.9984    0.8860    1.1324
## factor(nhanes$educ)3    0.8143     1.2281    0.6665    0.9948
## factor(nhanes$smk)2     1.1872     0.8423    1.0343    1.3628
## factor(nhanes$smk)3     1.8813     0.5315    1.6014    2.2102
## factor(nhanes$alc)1     0.8333     1.2001    0.7296    0.9516
## 
## Concordance= 0.857  (se = 0.005 )
## Likelihood ratio test= 2351  on 10 df,   p=<2e-16
## Wald test            = 1629  on 10 df,   p=<2e-16
## Score (logrank) test = 2328  on 10 df,   p=<2e-16
### Test for the proportional hazards assumption
test.prop <- cox.zph(cox.bpb3.adj) # Do any of the variables violate the proportional hazards assumption? May consider stratifying by sex.
test.prop 
##                      chisq df     p
## bpb3                0.7663  2 0.682
## nhanes$age          1.3879  1 0.239
## factor(nhanes$sex)  4.4387  1 0.035
## factor(nhanes$race) 0.3733  1 0.541
## factor(nhanes$educ) 0.1618  2 0.922
## factor(nhanes$smk)  0.4093  2 0.815
## factor(nhanes$alc)  0.0235  1 0.878
## GLOBAL              8.0277 10 0.626
## Display a graph of the scaled Schoenfeld residuals, along with a smooth curve
plot(test.prop) # for all variables

plot(test.prop, var = 4) + # Can call up variables indivdually, here for sex
abline(h = 0, lty = 3, col = 2)

## integer(0)
# Stratify by sex
cox.bpb3.adj1 <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 + nhanes$age + strata(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) + factor(nhanes$smk) + factor(nhanes$alc))
summary(cox.bpb3.adj1)
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 + 
##     nhanes$age + strata(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) + 
##     factor(nhanes$smk) + factor(nhanes$alc))
## 
##   n= 4905, number of events= 1211 
##    (169 observations deleted due to missingness)
## 
##                           coef exp(coef)  se(coef)      z Pr(>|z|)    
## bpb3[2.4, 4.4)        0.069557  1.072033  0.081702  0.851  0.39458    
## bpb3[4.4,48.1]        0.088475  1.092506  0.083082  1.065  0.28692    
## nhanes$age            0.088456  1.092486  0.002374 37.260  < 2e-16 ***
## factor(nhanes$race)2  0.178662  1.195616  0.070942  2.518  0.01179 *  
## factor(nhanes$educ)2  0.001237  1.001238  0.062599  0.020  0.98424    
## factor(nhanes$educ)3 -0.209755  0.810783  0.102180 -2.053  0.04009 *  
## factor(nhanes$smk)2   0.166505  1.181170  0.070394  2.365  0.01801 *  
## factor(nhanes$smk)3   0.631710  1.880825  0.082187  7.686 1.51e-14 ***
## factor(nhanes$alc)1  -0.179672  0.835545  0.067736 -2.653  0.00799 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                      exp(coef) exp(-coef) lower .95 upper .95
## bpb3[2.4, 4.4)          1.0720     0.9328    0.9134    1.2582
## bpb3[4.4,48.1]          1.0925     0.9153    0.9283    1.2857
## nhanes$age              1.0925     0.9153    1.0874    1.0976
## factor(nhanes$race)2    1.1956     0.8364    1.0404    1.3740
## factor(nhanes$educ)2    1.0012     0.9988    0.8856    1.1319
## factor(nhanes$educ)3    0.8108     1.2334    0.6636    0.9906
## factor(nhanes$smk)2     1.1812     0.8466    1.0289    1.3559
## factor(nhanes$smk)3     1.8808     0.5317    1.6010    2.2096
## factor(nhanes$alc)1     0.8355     1.1968    0.7317    0.9542
## 
## Concordance= 0.855  (se = 0.005 )
## Likelihood ratio test= 2315  on 9 df,   p=<2e-16
## Wald test            = 1599  on 9 df,   p=<2e-16
## Score (logrank) test = 2294  on 9 df,   p=<2e-16
cox.zph(cox.bpb3.adj1)
##                     chisq df    p
## bpb3                0.443  2 0.80
## nhanes$age          1.321  1 0.25
## factor(nhanes$race) 0.404  1 0.53
## factor(nhanes$educ) 0.156  2 0.93
## factor(nhanes$smk)  1.038  2 0.60
## factor(nhanes$alc)  0.476  1 0.49
## GLOBAL              3.675  9 0.93

6.7. Write your own functions

## Let's make a simple macro that calculates the mean and standard deviation at the same time.

# get the results in vector form
mystats <- function(x) {
  mymean <- mean(x, na.rm = T)
  mysd <- sd(x, na.rm = T)
  c(mean = mymean, sd = mysd)
}

mystats(nhanes$age)
##     mean       sd 
## 48.74359 19.29721
mystats(nhanes$bpb)
##     mean       sd 
## 3.958041 3.227670

Regression functions

# Assume that you are examining age-adjusted associations of SBP with from the 13th variables (bmi) to 25th variables (packyrs) (n=13)
# you want to run 13 linear regression models and save beta's and p-values

summ(nhanes)
## 
## No. of observations = 5074
## 
##    Var. name obs. mean     median  s.d.     min.   max.     
## 1  strata    5074 23.26    22      13.49    1      49       
## 2  seqn      5074 27086.17 32365.5 17087.06 3      53594    
## 3  race      5074 1.28     1       0.45     1      2        
## 4  sex       5074 1.54     2       0.5      1      2        
## 5  age       5074 48.74    46      19.3     20     90       
## 6  urban     5074 1.52     2       0.5      1      2        
## 7  region    5074 2.75     3       0.95     1      4        
## 8  pir       4587 2.46     2.01    1.79     0      11.89    
## 9  psu       5074 1.51     2       0.5      1      2        
## 10 wt_mh     5074 10416.3  4495.92 13014    225.93 139744.91
## 11 sbp       5074 126.26   122     19.96    80     237      
## 12 dbp       5074 74.52    74      10.57    32     134      
## 13 bmi       5064 27.24    26.4    5.82     14.4   68.5     
## 14 hematoc   5013 41.43    41.45   4.17     19.1   57.35    
## 15 bpb       5074 3.96     3.2     3.23     0.7    48.1     
## 16 chol      5016 207.21   204     45.5     83     676      
## 17 trig      5006 147.21   115     122.71   22     3616     
## 18 scalc     4942 9.27     9.3     0.43     7.3    13.9     
## 19 creat     4942 1.09     1       0.4      0.4    13.9     
## 20 calc      5074 50852.16 50049   25961.73 6      90842    
## 21 sodium    5074 8121.3   3155    13206.1  0      99712    
## 22 potass    5074 854.65   0.55    8176.14  0.01   97800    
## 23 educ      5041 1.73     2       0.68     1      3        
## 24 smk       5074 1.75     1       0.83     1      3        
## 25 packyrs   4856 9.58     0       22.02    0      456      
## 26 diag_ca   5074 0.07     0       0.26     0      1        
## 27 diag_dm   5065 0.09     0       0.29     0      1        
## 28 diag_ht   5035 0.28     0       0.45     0      1        
## 29 alc       4942 0.44     0       0.5      0      1        
## 30 phyact    5074 0.77     1       0.42     0      1        
## 31 med_ht    5029 0.17     0       0.37     0      1        
## 32 htn       5074 0.38     0       0.49     0      1        
## 33 d_total   5069 0.25     0       0.43     0      1        
## 34 pmon_int  5069 159.6    171     49.49    1      218      
## 35 pmon_mec  5069 158.7    170     49.48    0      217      
## 36 d_cancer  5069 0.05     0       0.22     0      1        
## 37 d_cvd     5069 0.11     0       0.32     0      1        
## 38 sex1      5074 1.54     2       0.498    1      2        
## 39 AGE5b     5074 2.942    3       1.425    1      5        
## 40 AGE5c     5074 2.516    2       1.55     1      5        
## 41 age5c     5074 2.52     2       1.55     1      5
test.var <- nhanes[, c(13:25)]
head(test.var)
##    bmi hematoc  bpb chol trig scalc creat  calc sodium    potass educ smk
## 1 25.5   43.80  5.0  268  174   9.5   1.1 70033    460 0.3099999    2   1
## 2 29.4   46.70  2.0  225  109   9.4   1.2 70385  38591 0.5103998    3   1
## 3 44.4   49.80  6.0  162   89   9.6   1.2 40040   3050 0.5599999    1   3
## 4 37.5   47.70 15.5  212  479  10.6   1.0 10003   1321 0.8099999    1   3
## 5 23.6   40.60 11.3  202   96   9.3   1.0 40021   5295 0.6499996    1   1
## 6 31.1   37.85  7.2  186  300   8.8   1.3 80005    831 0.7099996    1   3
##   packyrs
## 1     0.0
## 2     0.0
## 3    10.0
## 4     2.0
## 5     0.0
## 6    17.5
# example
mod <- lm(sbp ~ bmi + age, data = nhanes, na.action = na.omit)
summary(mod)
## 
## Call:
## lm(formula = sbp ~ bmi + age, data = nhanes, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.828 -10.192  -1.445   8.221  98.816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 82.71612    1.17680   70.29   <2e-16 ***
## bmi          0.50219    0.03795   13.23   <2e-16 ***
## age          0.61277    0.01146   53.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.71 on 5061 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.3793, Adjusted R-squared:  0.3791 
## F-statistic:  1547 on 2 and 5061 DF,  p-value: < 2.2e-16
summary(mod)$coef[2, 1] # how to extract beta for bmi
## [1] 0.5021861
summary(mod)$coef[2, 4] # how to extract p-value for bmi
## [1] 2.500308e-39
RegressResults <- function(data, y, cov) {
  nvar <- ncol(data)
  newdata <- data.frame(cbind(data, y, cov))

  tmatrix <- data.frame(matrix(NA, 2, nvar)) # 2 rows
  colnames(tmatrix) <- colnames(data) # create a row for each var
  rownames(tmatrix) <- c("beta", "p")

  for (i in 1:nvar) {
    ind <- data[, i]
    model <- lm(y ~ ind + cov, data = newdata, na.action = na.omit)
    tmatrix[1, i] <- summary(model)$coef[2, 1]
    tmatrix[2, i] <- summary(model)$coef[2, 4]
  }
  return(tmatrix)
}

reg_output <- RegressResults(test.var, nhanes$sbp, nhanes$age) # Run the function
write.csv(reg_output, file = file.path(directory, "sbp.results.csv")) # Look in your output directory for the beta coefficients and p-values for all covariates! 

Optional: Exercise 6C